Introduction:
The aim of this notebook is to explore the large dataset, deal with missig values & replace them adequately, make a good exploration of the data and forecast future energy consumption with Time Series approach.
The dataset contains information about energy consumption of a family house located in Northen France, for a period of 47 months.
Data Set Information:
This archive contains 2075259 measurements gathered in a house located in Sceaux (7km of Paris, France) between December 2006 and November 2010 (47 months). Notes:
(global_active_power*1000/60 - sub_metering_1 - sub_metering_2 - sub_metering_3) represents the active energy consumed every minute (in watt hour) in the household by electrical equipment not measured in sub-meterings 1, 2 and 3.
The dataset contains some missing values in the measurements (nearly 1,25% of the rows). All calendar timestamps are present in the dataset but for some timestamps, the measurement values are missing: a missing value is represented by the absence of value between two consecutive semi-colon attribute separators. For instance, the dataset shows missing values on April 28, 2007.
Attribute Information:
import numpy as np
import pandas as pd
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')
from numpy import nan as NA
from pandas import read_csv
from pandas.tools.plotting import autocorrelation_plot
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import calmap
import datetime
from math import sqrt
from pyramid.arima import auto_arima
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
dataset = read_csv('household_power_consumption.txt', sep=';', header=0,
low_memory=False, infer_datetime_format=True,
parse_dates={'datetime':[0,1]}, index_col=['datetime'])
# summarize
print(dataset.shape)
print(dataset.head())
dataset.replace('?', NA, inplace=True)
# add a column for for the remainder of sub metering
values = dataset.values.astype('float32')
dataset['sub_metering_4'] = (values[:,0] * 1000 / 60) - (values[:,4] + values[:,5] + values[:,6])
# save updated dataset
#dataset.to_csv('household_power_consumption.csv')
# load the new dataset and summarize
dataset = read_csv('household_power_consumption.csv', header=0, infer_datetime_format=True, parse_dates=['datetime'], index_col=['datetime'])
print(dataset.head())
# line plot for each variable
pyplot.figure(figsize=(15,15))
for i in range(len(dataset.columns)):
pyplot.subplot(len(dataset.columns), 1, i+1)
name = dataset.columns[i]
pyplot.plot(dataset[name])
pyplot.title(name, y=0)
pyplot.show()
# plot active power for each year
years = ['2007', '2008', '2009', '2010']
pyplot.figure(figsize=(15,15))
for i in range(len(years)):
ax = pyplot.subplot(len(years), 1, i+1)
year = years[i]
result = dataset[str(year)]
pyplot.plot(result['Global_active_power'])
pyplot.title(str(year), y=0, loc='left')
pyplot.show()
# plot voltage for each year
years = ['2007', '2008', '2009', '2010']
pyplot.figure(figsize=(15,15))
for i in range(len(years)):
ax = pyplot.subplot(len(years), 1, i+1)
year = years[i]
result = dataset[str(year)]
pyplot.plot(result['Voltage'])
pyplot.title(str(year), y=0, loc='left')
pyplot.show()
# plot active power in a given year
months = [x for x in range(1, 13)]
pyplot.figure(figsize=(15,25))
for i in range(len(months)):
ax = pyplot.subplot(len(months), 1, i+1)
month = '2007-' + str(months[i])
result = dataset[month]
pyplot.plot(result['Global_active_power'])
pyplot.title(month, y=0, loc='left')
pyplot.show()
# histogram plot for each variable
pyplot.figure(figsize=(15,15))
for i in range(len(dataset.columns)):
pyplot.subplot(len(dataset.columns), 1, i+1)
name = dataset.columns[i]
dataset[name].hist(bins=100)
pyplot.title(name, y=0)
pyplot.show()
clean_df = dataset.dropna(axis=0,how='any')
clean_df.head()
x = clean_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)
df.index = clean_df.index
df.columns = clean_df.columns
df.head()
#Plot a day in march, june, sept, december of 2008
pyplot.figure(figsize=(20,12))
months = ['03', '06', '09', '12']
variables = ['Global_active_power', 'Sub_metering_1', 'Sub_metering_2','Sub_metering_3']
for i,j in zip(range(4), months):
for k in variables:
pyplot.subplot(4,1, i+1)
pyplot.plot(df['2008-'+ j +'-15'][k])
pyplot.legend(loc=2)
# plot Christmas for each year
years = ['2006', '2007', '2008', '2009']
variables = ['Global_active_power', 'Sub_metering_1', 'Sub_metering_2','Sub_metering_3', 'sub_metering_4']
pyplot.figure(figsize=(20,12))
for i in range(len(years)):
for j in variables:
pyplot.subplot(len(years), 1, i+1)
pyplot.plot(df[years[i] + '-12-25'][j])
pyplot.title(years[i], y=0, loc='left')
pyplot.legend(loc=2)
# plot two days after Christmas for each year
years = ['2006', '2007', '2008', '2009']
variables = ['Global_active_power', 'Sub_metering_1', 'Sub_metering_2','Sub_metering_3', 'sub_metering_4']
pyplot.figure(figsize=(20,12))
for i in range(len(years)):
for j in variables:
pyplot.subplot(len(years), 1, i+1)
pyplot.plot(df[years[i] + '-12-27'][j])
pyplot.title(years[i], y=0, loc='left')
pyplot.legend(loc=2)
# plot 14th july Bastille day for each year
years = ['2007', '2008', '2009', '2010']
variables = ['Global_active_power', 'Sub_metering_1', 'Sub_metering_2','Sub_metering_3', 'sub_metering_4']
pyplot.figure(figsize=(20,12))
for i in range(len(years)):
for j in variables:
pyplot.subplot(len(years), 1, i+1)
pyplot.plot(df[years[i] + '-07-14'][j])
pyplot.title(years[i], y=0, loc='left')
pyplot.legend(loc=2)
day_mean_df = dataset.resample('D').mean()
week_mean_df = dataset.resample('W').mean()
month_mean_df = dataset.resample('M').mean()
df = [day_mean_df, week_mean_df, month_mean_df]
pyplot.figure(figsize=(20,20))
for i in range(len(df)):
pyplot.subplot(len(df),1,i+1)
df_names = ['Day Mean', 'Week Mean', 'Month Mean']
pyplot.title(df_names[i])
dataframe = df[i]
names = []
for j in [0,4,5,6,7]:
pyplot.plot(dataframe.iloc[:,[j]])
names.append(dataset.columns[j])
pyplot.legend(names)
years = ['2006', '2007', '2008', '2009', '2010']
pyplot.figure(figsize=(20,20))
for i in range(len(years)):
pyplot.subplot(len(years), 1, i+1)
pyplot.title(years[i] + ' Week Mean')
year = years[int(i)]
result = week_mean_df[year]
names = []
pyplot.title(years[i], y=0, loc='left')
for j in [0,4,5,6,7]:
pyplot.plot(result.iloc[:,[j]])
names.append(result.columns[j])
pyplot.legend(names, loc=2)
years = ['2007', '2008', '2009', '2010']
pyplot.figure(figsize=(20,20))
for i in range(len(years)):
pyplot.subplot(len(years), 1, i+1)
pyplot.title(years[i] + ' Month Mean')
year = years[i]
result = month_mean_df[year]
names = []
pyplot.title(years[i], y=0, loc='left')
for j in [0,4,5,6,7]:
pyplot.plot(result.iloc[:,[j]])
names.append(result.columns[j])
pyplot.legend(names, loc=2)
dataset.isna().values.any()
na_10 = pd.DataFrame(dataset['Global_active_power'].isna().map({False:0, True:1}))
na_10.head()
pyplot.figure(figsize=(30,20))
pyplot.suptitle('NaN Distribution', fontsize=24)
years = ['2006','2007', '2008', '2009', '2010']
for i in range(len(years)):
pyplot.subplot(len(years), 1, i+1)
pyplot.title(years[i], y=0, loc='left', fontsize=20)
pyplot.plot(na_10[years[i]])
na_sum = na_10.resample('D').sum()
#na_sum['sum_log'] = np.log(na_sum.Global_active_power)
na_sum['sum_div'] = na_sum.Global_active_power/500
pyplot.figure(figsize=(20,5))
na_sum.Global_active_power.plot()
pyplot.figure(figsize=(20,20))
years = ['2006','2007', '2008', '2009', '2010']
for i in range(len(years)):
pyplot.subplot(5,1,i+1)
pyplot.title(years[i], y=0, loc='left', fontsize=20)
calmap.yearplot(na_sum.sum_div, year=int(years[i]), vmax=.03, vmin=0,
fillcolor='grey', linewidth=2, cmap='Reds')
dataset[dataset.isnull().any(axis=1)].head()
def find_distr(df, hour, col):
all_hour_mean = np.abs(round(df.iloc[:,col].resample('H').mean(),1))
hist = all_hour_mean[all_hour_mean.index.hour == hour]
#hist.hist(bins=50)
vals = pd.value_counts(hist).index.tolist()
freq = pd.value_counts(hist).tolist()
prob = [i/round(sum(vals),1) for i in vals]
pick = np.random.choice(vals, 1, p=prob)
return float(pick)
def fill_missing(df):
df1 = df.resample('H').mean()
for row in range(df1.shape[0]):
for col in range(df1.shape[1]):
if np.isnan(df1.iloc[row, col]):
hour = dataset.index[row].hour
df1.iloc[row, col] = find_distr(df1, hour, col)
return df1
try_df = dataset.iloc[:,:]
try_df.isna().values.any()
clean_df = fill_missing(try_df)
clean_df.isna().values.any()
#clean_df.to_csv('clean_household_power_consumption.csv')
pyplot.figure(figsize=(20,10))
pyplot.subplot(3,1,1)
clean_df['2010-01-01':'2010-01-31']['Global_active_power'].resample('H').mean().plot()
dataset['2010-01-01':'2010-01-31']['Global_active_power'].resample('H').mean().plot()
pyplot.title('2010-01-01 to 2010-01-31 Global_active_power', y=1)
pyplot.legend(['Missing random guess', 'Real data'], loc=2)
pyplot.subplot(3,1,2)
clean_df['2010-08-01':'2010-08-31']['Global_active_power'].resample('H').mean().plot()
dataset['2010-08-01':'2010-08-31']['Global_active_power'].resample('H').mean().plot()
pyplot.title('2010-08-01 to 2010-08-31 Global_active_power', y=1)
pyplot.legend(['Missing random guess', 'Real data'], loc=2)
pyplot.subplot(3,1,3)
clean_df['2010-09-15':'2010-10-15']['Global_active_power'].resample('H').mean().plot()
dataset['2010-09-15':'2010-10-15']['Global_active_power'].resample('H').mean().plot()
pyplot.title('2010-09-15 to 2010-10-15 Global_active_power', y=1)
pyplot.legend(['Missing random guess', 'Real data'], loc=2)
pyplot.figure(figsize=(20,10))
pyplot.subplot(3,1,1)
clean_df['2010-01-01':'2010-01-31']['Voltage'].resample('H').mean().plot()
dataset['2010-01-01':'2010-01-31']['Voltage'].resample('H').mean().plot()
pyplot.title('2010-01-01 to 2010-01-31 Voltage', y=1)
pyplot.legend(['Missing random guess', 'Real data'], loc=2)
pyplot.subplot(3,1,2)
clean_df['2010-08-01':'2010-08-31']['Voltage'].resample('H').mean().plot()
dataset['2010-08-01':'2010-08-31']['Voltage'].resample('H').mean().plot()
pyplot.title('2010-08-01 to 2010-08-31 Voltage', y=1)
pyplot.legend(['Missing random guess', 'Real data'], loc=2)
pyplot.subplot(3,1,3)
clean_df['2010-09-15':'2010-10-15']['Voltage'].resample('H').mean().plot()
dataset['2010-09-15':'2010-10-15']['Voltage'].resample('H').mean().plot()
pyplot.title('2010-09-15 to 2010-10-15 Voltage', y=1)
pyplot.legend(['Missing random guess', 'Real data'], loc=2)
ts_week = clean_df['Global_active_power'].resample('W').mean()
ts_month = clean_df['Global_active_power'].resample('M').mean()
ts_week['2006']
ts_week.plot(kind='kde')
pyplot.figure(figsize=(20,5))
pyplot.plot(ts_week)
pyplot.plot(ts_month)
pyplot.legend(['Global_active_power (Week)', 'Global_active_power (Month)'], loc=1)
def test_stationarity(timeseries):
pyplot.figure(figsize=(20,5))
#Determing rolling statistics
rolmean = pd.rolling_mean(timeseries, window=12)
rolstd = pd.rolling_std(timeseries, window=12)
#Plot rolling statistics:
orig = pyplot.plot(timeseries, color='blue',label='Original')
mean = pyplot.plot(rolmean, color='red', label='Rolling Mean')
std = pyplot.plot(rolstd, color='black', label = 'Rolling Std')
pyplot.legend(loc='best')
pyplot.title('Rolling Mean & Standard Deviation')
pyplot.show(block=False)
#Perform Dickey-Fuller test:
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print (dfoutput)
test_stationarity(ts_week)
test_stationarity(ts_month)
def decomposition(timeseries):
decomposition = seasonal_decompose(timeseries)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
pyplot.figure(figsize=(15,8))
pyplot.subplot(4,1,1)
pyplot.plot(timeseries, label='Original')
pyplot.legend(loc=1)
pyplot.subplot(4,1,2)
pyplot.plot(trend, label='Trend')
pyplot.legend(loc=1)
pyplot.subplot(4,1,3)
pyplot.plot(seasonal,label='Seasonality')
pyplot.legend(loc=1)
pyplot.subplot(4,1,4)
pyplot.plot(residual, label='Residuals')
pyplot.legend(loc=1)
pyplot.tight_layout()
decomposition(ts_week)
v = sm.tsa.seasonal_decompose(ts_week)
v.plot()
result = sm.tsa.stattools.adfuller(ts_week)
pyplot.show()
substracted = ts_week - v.seasonal
decomposition(ts_month)
pyplot.figure(figsize=(15,5))
autocorrelation_plot(ts_week)
pyplot.figure(figsize=(15,5))
autocorrelation_plot(ts_month)
X = pd.DataFrame(ts_week)
train_size = int(len(X) * 0.7)
train = X.iloc[:train_size]
test = X.iloc[train_size:]
print('Observations: %d' % (len(X)))
print('Training Observations: %d' % (len(train)))
print('Testing Observations: %d' % (len(test)))
model = auto_arima(train, trace=True,
seasonal=True, m=52,
error_action='ignore', suppress_warnings=True)
model = model.fit(train)
week_forecast = model.predict(n_periods=len(test))
week_forecast = pd.DataFrame(week_forecast,index = test.index,columns=['Prediction'])
#plot the predictions for validation set
pyplot.figure(figsize=(15,5))
pyplot.plot(train)
pyplot.plot(test)
pyplot.plot(week_forecast)
pyplot.legend(['Train','Test','Weekly AutoArima Forecast'], loc=1)
pyplot.show()
rms = sqrt(mean_squared_error(test,week_forecast))
print('RMSE %f' % rms)
print(model.summary())
model2 = auto_arima(X, trace=True,
seasonal=True, m=52,
error_action='ignore', suppress_warnings=True)
model2 = model2.fit(X)
nex = [test.index[-1] + datetime.timedelta(weeks=i) for i in range(53)]
week_forecast2 = model2.predict(n_periods=len(nex))
week_forecast2 = pd.Series(week_forecast2,index = nex)
X = pd.DataFrame(ts_month)
train_size = int(len(X) * 0.7)
train = X.iloc[:train_size]
test = X.iloc[train_size:]
print('Observations: %d' % (len(X)))
print('Training Observations: %d' % (len(train)))
print('Testing Observations: %d' % (len(test)))
model = auto_arima(train, trace=True,
seasonal=True, m=12,
error_action='ignore', suppress_warnings=True)
model.fit(train)
month_forecast = model.predict(n_periods=len(test))
month_forecast = pd.DataFrame(month_forecast,index = test.index,columns=['Prediction'])
#plot the predictions for validation set
pyplot.figure(figsize=(15,5))
pyplot.plot(train)
pyplot.plot(test)
pyplot.plot(month_forecast)
pyplot.legend(['Train','Test','Monthly AutoArima Forecast'], loc=1)
pyplot.show()
rms = sqrt(mean_squared_error(test,month_forecast))
print('RMSE %f' % rms)
print(model.summary())
model2 = auto_arima(X, trace=True,
seasonal=True, m=12,
error_action='ignore', suppress_warnings=True)
model2 = model2.fit(X)
nex_m = [test.index[-1] + i for i in range(12)]
month_forecast2 = model2.predict(n_periods=len(nex_m))
month_forecast2 = pd.Series(month_forecast2,index = nex_m)
X = pd.DataFrame(ts_week)
train_size = int(len(X) * 0.7)
train = X.iloc[:train_size]
test = X.iloc[train_size:]
model = ExponentialSmoothing(train, seasonal='add', seasonal_periods=52).fit()
pred = model.predict(start=test.index[0], end=test.index[-1])
pyplot.figure(figsize=(15,5))
pyplot.plot(train.index, train, label='Train')
pyplot.plot(test.index, test, label='Test')
pyplot.plot(pred.index, pred, label='Weekly Holt-Winters')
pyplot.legend(loc=1)
rms = sqrt(mean_squared_error(test,pred))
print('RMSE %f' % rms)
model2 = ExponentialSmoothing(X, seasonal='add', seasonal_periods=52).fit()
pred2 = model2.predict(start=nex[0], end=nex[-1])
week_forecast2_HW = pd.Series(pred2,index = nex)
X = pd.DataFrame(ts_month)
train_size = int(len(X) * 0.7)
train = X.iloc[:train_size]
test = X.iloc[train_size:]
model = ExponentialSmoothing(train, seasonal='add', seasonal_periods=12).fit()
pred = model.predict(start=test.index[0], end=test.index[-1])
pyplot.figure(figsize=(15,5))
pyplot.plot(train.index, train, label='Train')
pyplot.plot(test.index, test, label='Test')
pyplot.plot(pred.index, pred, label='Montly Holt-Winters')
pyplot.legend(loc=1)
rms = sqrt(mean_squared_error(test,pred))
print('RMSE %f' % rms)
print(model.summary())
model2 = ExponentialSmoothing(X, seasonal='add', seasonal_periods=12).fit()
pred2 = model2.predict(start=nex_m[0], end=nex_m[-1])
month_forecast2_HW = pd.Series(pred2,index = nex_m)
X = pd.DataFrame(ts_week)
train_size = int(len(X) * 0.7)
train = X.iloc[:train_size]
test = X.iloc[train_size:]
my_order = (1, 0, 0)
my_seasonal_order = (1, 1, 0, 52)
model = SARIMAX(train, order=my_order, seasonal_order=my_seasonal_order).fit()
predictions_sarimax = model.get_forecast(steps=round(len(test)))
pred = predictions_sarimax.summary_frame()
model2 = SARIMAX(X, order=my_order, seasonal_order=my_seasonal_order).fit()
predictions_sarimax2 = model2.get_forecast(steps=52)
pred2 = predictions_sarimax2.summary_frame()
#pred = model.predict(start=test.index[0], end=test.index[-1])
#pred = pd.DataFrame(pred,index = test.index,columns=['Prediction'])
pyplot.figure(figsize=(25,5))
pyplot.plot(train.index, train, label='Train')
pyplot.plot(test.index, test, label='Test')
pyplot.plot(pred.index, pred['mean'].values, label='Weekly SARIMAX')
pyplot.fill_between(pred.index,
pred['mean_ci_lower'],
pred['mean_ci_upper'],
color='green', alpha=.1
)
pyplot.plot(pred2.index, pred2['mean'].values, color = 'red',label='Forecast Weekly SARIMAX')
pyplot.fill_between(pred2.index,
pred2['mean_ci_lower'],
pred2['mean_ci_upper'],
color='red', alpha=.1
)
pyplot.legend(loc=1)
pyplot.figure(figsize=(25,5))
pyplot.plot(ts_week.index, ts_week, label='Original')
pyplot.plot(pred2.index, pred2['mean'].values, color = 'red',label='Forecast Weekly SARIMAX')
pyplot.fill_between(pred2.index,
pred2['mean_ci_lower'],
pred2['mean_ci_upper'],
color='red', alpha=.1
)
pyplot.legend(loc=1)
week_sarimax = pd.Series(pred2['mean'].values, index=pred2.index)
model2.plot_diagnostics(figsize=(10,8))
X = pd.DataFrame(ts_month)
train_size = int(len(X) * 0.7)
train = X.iloc[:train_size]
test = X.iloc[train_size:]
my_order = (1, 0, 0)
my_seasonal_order = (1, 1, 0, 12)
model = SARIMAX(train, order=my_order, seasonal_order=my_seasonal_order).fit()
predictions_sarimax = model.get_forecast(steps=round(len(test)))
pred = predictions_sarimax.summary_frame()
model2 = SARIMAX(X, order=my_order, seasonal_order=my_seasonal_order).fit()
predictions_sarimax2 = model2.get_forecast(steps=12)
pred2 = predictions_sarimax2.summary_frame()
#pred = model.predict(start=test.index[0], end=test.index[-1])
#pred = pd.DataFrame(pred,index = test.index,columns=['Prediction'])
pyplot.figure(figsize=(25,5))
pyplot.plot(train.index, train, label='Train')
pyplot.plot(test.index, test, label='Test')
pyplot.plot(pred.index, pred['mean'].values, label='Monthly SARIMAX')
pyplot.fill_between(pred.index,
pred['mean_ci_lower'],
pred['mean_ci_upper'],
color='green', alpha=.1
)
pyplot.plot(pred2.index, pred2['mean'].values, color = 'red',label='Forecast Monthly SARIMAX')
pyplot.fill_between(pred2.index,
pred2['mean_ci_lower'],
pred2['mean_ci_upper'],
color='red', alpha=.1
)
pyplot.legend(loc=1)
pyplot.figure(figsize=(25,5))
pyplot.plot(ts_month.index, ts_month, label='Original')
pyplot.plot(pred2.index, pred2['mean'].values, color = 'red',label='Forecast Monthly SARIMAX')
pyplot.fill_between(pred2.index,
pred2['mean_ci_lower'],
pred2['mean_ci_upper'],
color='red', alpha=.1
)
pyplot.legend(loc=1)
month_sarimax = pd.Series(pred2['mean'].values, index=pred2.index)
model2.plot_diagnostics(figsize=(10,8))
pyplot.figure(figsize=(15,5))
pyplot.plot(ts_week.index, ts_week, label='Original')
pyplot.plot(week_forecast2, label='Prediction Auto-Arima', color='red')
pyplot.plot(week_forecast2_HW, label='Prediction Holt-Winters', color='green')
pyplot.plot(week_sarimax, label='Prediction SARIMAX', color='orange')
pyplot.legend(loc=1)
month_up = []
for i,j in list(zip(month_forecast2_HW.values, np.arange(1.05, 1.35, (1.35-1.05)/len(month_forecast2_HW.values)))):
month_up.append(i * j)
month_dwn = []
for z,q in list(zip(month_forecast2_HW.values, np.arange(0.95, 0.65, -(0.95-0.65)/len(month_forecast2_HW.values)))):
month_dwn.append(z * q)
pyplot.figure(figsize=(15,5))
pyplot.plot(ts_month.index, ts_month, label='Original')
pyplot.plot(month_forecast2, label='Prediction Auto-Arima', color='red')
pyplot.plot(month_forecast2_HW, label='Prediction Holt-Winters', color='green')
pyplot.fill_between(month_forecast2_HW.index,
month_up,
month_dwn,
alpha=0.1, color='k')
pyplot.plot(month_sarimax, label='Prediction SARIMAX', color='orange')
pyplot.legend(loc=1)
#https://public.tableau.com/views/ENERGY_15536820743440/Dashboard1?:embed=y&:display_count=yes&publish=yes